The Office

We will analyse the IMDB ratings data for episodes of The Office.

# Load the Office episode ratings CSV published in the TidyTuesday repository.
office_ratings <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv"
)

# Show the loaded tibble.
office_ratings
## # A tibble: 188 x 6
##    season episode title             imdb_rating total_votes air_date  
##     <dbl>   <dbl> <chr>                   <dbl>       <dbl> <date>    
##  1      1       1 Pilot                     7.6        3706 2005-03-24
##  2      1       2 Diversity Day             8.3        3566 2005-03-29
##  3      1       3 Health Care               7.9        2983 2005-04-05
##  4      1       4 The Alliance              8.1        2886 2005-04-12
##  5      1       5 Basketball                8.4        3179 2005-04-19
##  6      1       6 Hot Girl                  7.8        2852 2005-04-26
##  7      2       1 The Dundies               8.7        3213 2005-09-20
##  8      2       2 Sexual Harassment         8.2        2736 2005-09-27
##  9      2       3 Office Olympics           8.4        2742 2005-10-04
## 10      2       4 The Fire                  8.4        2713 2005-10-11
## # … with 178 more rows

IMDB Ratings

We are going to explore the ratings from IMDB.

library(ggplot2)
library(dplyr)


# Box plots of the IMDB rating distribution for each season.
office_ratings %>%
  # Season is numeric in the data; convert it to a factor so every season
  # gets its own discrete box and colour.
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
  geom_boxplot() +
  labs(
    title = "Boxplots of IMDB Ratings per Season",
    x = "Season",
    y = "IMDB Rating"
  ) +
  theme_light() +
  # On a discrete axis the factor levels sit at integer positions, so 7.5
  # draws the dashed marker between seasons 7 and 8.
  geom_vline(xintercept = 7.5, linetype = "dashed") +
  # Label for the dashed marker (spelling of "Michael" corrected).
  annotate("text", x = 8.5, y = 6, label = "Michael Scott\n Leaves")

Summary Statistics

library(knitr)
# Per-season summary of IMDB ratings: mean, highest and lowest episode rating.
season_summary <- office_ratings %>%
  group_by(season) %>%
  summarise(
    mean_rating = mean(imdb_rating),
    highest_rating = max(imdb_rating),
    lowest_rating = min(imdb_rating)
  )

# Render the summary as a table.
kable(season_summary)
season mean_rating highest_rating lowest_rating
1 8.016667 8.4 7.6
2 8.436364 9.3 7.9
3 8.573913 9.3 8.0
4 8.600000 9.3 7.9
5 8.492308 9.6 8.1
6 8.219231 9.3 6.8
7 8.316667 9.7 7.5
8 7.666667 8.2 6.7
9 7.956522 9.7 7.1

Multiple Variables Plot

# Print the ratings tibble again for reference before plotting.
office_ratings
## # A tibble: 188 x 6
##    season episode title             imdb_rating total_votes air_date  
##     <dbl>   <dbl> <chr>                   <dbl>       <dbl> <date>    
##  1      1       1 Pilot                     7.6        3706 2005-03-24
##  2      1       2 Diversity Day             8.3        3566 2005-03-29
##  3      1       3 Health Care               7.9        2983 2005-04-05
##  4      1       4 The Alliance              8.1        2886 2005-04-12
##  5      1       5 Basketball                8.4        3179 2005-04-19
##  6      1       6 Hot Girl                  7.8        2852 2005-04-26
##  7      2       1 The Dundies               8.7        3213 2005-09-20
##  8      2       2 Sexual Harassment         8.2        2736 2005-09-27
##  9      2       3 Office Olympics           8.4        2742 2005-10-04
## 10      2       4 The Fire                  8.4        2713 2005-10-11
## # … with 178 more rows
# Static scatter plot of total votes against IMDB rating, one point per
# episode, coloured by season.
office_ratings %>%
  # Factor conversion gives a discrete colour per season.
  mutate(season = as.factor(season)) %>%
  ggplot(mapping = aes(x = imdb_rating, y = total_votes, colour = season)) +
  geom_point() +
  ggtitle("Votes vs Rating") +
  xlab("IMDB Rating") +
  ylab("Total Votes")

library(plotly)

# Build the votes-vs-rating scatter with an extra `text` aesthetic that holds
# the HTML tooltip shown for each episode in the interactive version.
votes_rating_plot <- office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(
    mapping = aes(
      x = imdb_rating,
      y = total_votes,
      colour = season,
      # Tooltip: episode code and title, then rating, vote count and season
      # on separate lines (<br> is the line break in the tooltip).
      text = paste0(
        "S", season, ".E", episode, " ", title,
        "<br>IMDB: ", imdb_rating,
        "<br>No. Votes: ", total_votes,
        "<br>Season: ", season
      )
    )
  ) +
  geom_point() +
  labs(title = "Votes vs Rating", x = "IMDB Rating", y = "Total Votes")

# Convert to an interactive plotly widget, showing only the custom tooltip.
ggplotly(votes_rating_plot, tooltip = "text")